from sklearn import feature_extraction
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn import preprocessing
import numpy as np
import time
import codecs

# Hyperparameters for the NMF-style topic model.
k = 100              # number of topics
countIter = 100      # number of alternating U/V update iterations
NUMTERMS = 20        # top terms displayed per topic
lmd1 = 3e-5          # regularization weight used in the V update
lmd2 = 0             # regularization weight used in the U update
sparse = 1e-5        # NOTE(review): unused in this file -- confirm before removing
save_file = "result1/JXL_"

# Read the corpus: one document per line. A context manager closes the
# handle even on error (the original left it open), and readlines()
# replaces the deprecated Python-2-only xreadlines().
with open("result1/data1.txt") as corpus_file:
	corpus = corpus_file.readlines()

# NOTE(review): TfidfVectorizer already applies IDF weighting, so piping
# its output through TfidfTransformer applies IDF twice. Kept as-is to
# preserve the original results; confirm this is intentional.
tfidf_vectorizer = TfidfVectorizer(min_df=8)
transformer = TfidfTransformer()
tfidf = transformer.fit_transform(tfidf_vectorizer.fit_transform(corpus))
word = tfidf_vectorizer.get_feature_names()
weight = tfidf.toarray()      # dense document-by-term TF-IDF matrix
D = np.matrix(weight)
print(type(word[0]))
m, n = D.shape                # m documents, n terms
print(D.shape)
# Random non-negative initialization of the factorization D ~ U * V,
# with U (m x k) and V (k x n).
U = np.matrix(np.random.rand(m, k))
V = np.matrix(np.random.rand(k, n))

def show_top_terms(W, m, n, k, terms):
	'''Print the NUMTERMS highest-weighted terms for each of the k topics.

	W     -- term-by-topic weight matrix, indexable as W[t][c]
	m, n  -- unused; kept for interface compatibility with existing callers
	k     -- number of topics (columns of W)
	terms -- term strings aligned with the rows of W
	'''
	for c in range(k):
		# Pair each term with its membership weight in topic c and rank
		# by weight, strongest first. Sorting pairs directly avoids the
		# original's throwaway dict (which also silently collapsed
		# duplicate terms).
		ranked = sorted(((terms[t], W[t][c]) for t in range(len(W))),
		                key=lambda pair: pair[1], reverse=True)
		print("\nTopic %d:" % (c + 1))
		# Slicing tolerates vocabularies shorter than NUMTERMS, where the
		# original negative indexing raised IndexError.
		for term, membership in ranked[:NUMTERMS]:
			print(u"\t%s\t(%.12f)" % (term, membership))

def updateU():
	'''One update step for the document-topic matrix U.

	Solves the regularized least-squares subproblem against the
	module-level D, V, and lmd2, clips negative entries to zero, and
	renormalizes every row to sum to 1.
	'''
	updated = (D * V.T - lmd2) * (V * V.T).I
	updated = updated.clip(0, np.inf)
	return updated / updated.sum(1)

def updateV():
	'''One update step for the topic-term matrix V.

	Solves the lmd1-regularized subproblem against the module-level
	U, D, V, and lmd1, clips negative entries to zero, and renormalizes
	every row to sum to 1.
	'''
	inv_term = (U.T * U - lmd1 * (V * V.T).I).I
	updated = inv_term * (U.T * D)
	updated = updated.clip(0, np.inf)
	return updated / updated.sum(1)

# Alternate the U and V multiplicative updates, printing the elapsed
# time per iteration and snapshotting both factor matrices to disk
# every 100 iterations.
t0 = time.time()
for i in range(countIter):
	U = updateU()
	V = updateV()
	# print(...) form is consistent with the rest of the file and works
	# on both Python 2 and 3 (the original print statement was 2-only).
	print("iteration %d :%f" % (i, time.time() - t0))
	if (i + 1) % 100 == 0:
		np.savetxt(save_file + "topic_" + str(k) + "_iter" + str(i + 1) + "-U.txt", U, delimiter=',')
		np.savetxt(save_file + str(i + 1) + "-V.txt", V, delimiter=',')

# Persist the vocabulary, one term per line, UTF-8 encoded.
# codecs.open (already imported, previously unused) handles the encoding,
# so this produces the same bytes as the original on Python 2 while also
# working on Python 3, where w.encode("utf-8") + "\n" is a bytes+str
# TypeError. Using `with` also stops shadowing the `file` builtin and
# guarantees the handle is closed on error.
with codecs.open("result1/word.txt", "w", encoding="utf-8") as out_file:
	for w in word:
		out_file.write(w + u"\n")